{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "WhisperForConditionalGeneration(\n",
       "  (model): WhisperModel(\n",
       "    (encoder): WhisperEncoder(\n",
       "      (conv1): Conv1d(128, 1280, kernel_size=(3,), stride=(1,), padding=(1,))\n",
       "      (conv2): Conv1d(1280, 1280, kernel_size=(3,), stride=(2,), padding=(1,))\n",
       "      (embed_positions): Embedding(1500, 1280)\n",
       "      (layers): ModuleList(\n",
       "        (0): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (1): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (2): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (3): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (4): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (5): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (6): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (7): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (8): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (9): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (10): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (11): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (12): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (13): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (14): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (15): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (16): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (17): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (18): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (19): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (20): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (21): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (22): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (23): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (24): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (25): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (26): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (27): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (28): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (29): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (30): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (31): WhisperEncoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (activation_fn): GELUActivation()\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "      )\n",
       "      (layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "    )\n",
       "    (decoder): WhisperDecoder(\n",
       "      (embed_tokens): Embedding(51866, 1280, padding_idx=50256)\n",
       "      (embed_positions): WhisperPositionalEmbedding(448, 1280)\n",
       "      (layers): ModuleList(\n",
       "        (0): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (1): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (2): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (3): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (4): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (5): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (6): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (7): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (8): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (9): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (10): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (11): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (12): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (13): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (14): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (15): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (16): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (17): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (18): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (19): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (20): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (21): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (22): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (23): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (24): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (25): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (26): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (27): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (28): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (29): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (30): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "        (31): WhisperDecoderLayer(\n",
       "          (self_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (activation_fn): GELUActivation()\n",
       "          (self_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (encoder_attn): WhisperAttention(\n",
       "            (k_proj): Linear(in_features=1280, out_features=1280, bias=False)\n",
       "            (v_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (q_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "            (out_proj): Linear(in_features=1280, out_features=1280, bias=True)\n",
       "          )\n",
       "          (encoder_attn_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "          (fc1): Linear(in_features=1280, out_features=5120, bias=True)\n",
       "          (fc2): Linear(in_features=5120, out_features=1280, bias=True)\n",
       "          (final_layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "        )\n",
       "      )\n",
       "      (layer_norm): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n",
       "    )\n",
       "  )\n",
       "  (proj_out): Linear(in_features=1280, out_features=51866, bias=False)\n",
       ")"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import torch\n",
    "from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline\n",
    "from datasets import load_dataset\n",
    "from tqdm import tqdm\n",
    "\n",
    "torch.set_float32_matmul_precision(\"high\")\n",
    "\n",
    "device = \"cuda:0\" if torch.cuda.is_available() else \"cpu\"\n",
    "torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32\n",
    "\n",
    "model_id = \"/data/models/huggingface/whisper-large-v3\"\n",
    "\n",
    "model = AutoModelForSpeechSeq2Seq.from_pretrained(\n",
    "    model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True\n",
    ").to(device)\n",
    "model\n",
    "\n",
    "# # Enable static cache and compile the forward pass\n",
    "# model.generation_config.cache_implementation = \"static\"\n",
    "# model.generation_config.max_new_tokens = 256\n",
    "# model.forward = torch.compile(model.forward, mode=\"reduce-overhead\", fullgraph=True)\n",
    "\n",
    "processor = AutoProcessor.from_pretrained(model_id)\n",
    "\n",
    "pipe = pipeline(\n",
    "    \"automatic-speech-recognition\",\n",
    "    model=model,\n",
    "    tokenizer=processor.tokenizer,\n",
    "    feature_extractor=processor.feature_extractor,\n",
    "    torch_dtype=torch_dtype,\n",
    "    device=device,\n",
    ")\n",
    "\n",
    "# dataset = load_dataset(\"distil-whisper/librispeech_long\", \"clean\", split=\"validation\")\n",
    "# sample = dataset[0][\"audio\"]\n",
    "\n",
    "# # 2 warmup steps\n",
    "# for _ in tqdm(range(2), desc=\"Warm-up step\"):\n",
    "#     with sdpa_kernel(SDPBackend.MATH):\n",
    "#         result = pipe(sample.copy(), generate_kwargs={\"min_new_tokens\": 256, \"max_new_tokens\": 256})\n",
    "\n",
    "# # fast run\n",
    "# with sdpa_kernel(SDPBackend.MATH):\n",
    "#     result = pipe(sample.copy())\n",
    "\n",
    "# print(result[\"text\"])"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
